In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
In [3]:
df = pd.read_csv(r"C:\Users\Iddrisu Bachokun\Desktop\Python\Data\parkinsin\parkinsons.csv")
pd.set_option('display.max_columns',None)
df.head()
Out[3]:
name MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer MDVP:Shimmer(dB) Shimmer:APQ3 Shimmer:APQ5 MDVP:APQ Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
0 phon_R01_S01_1 119.992 157.302 74.997 0.00784 0.00007 0.00370 0.00554 0.01109 0.04374 0.426 0.02182 0.03130 0.02971 0.06545 0.02211 21.033 1 0.414783 0.815285 -4.813031 0.266482 2.301442 0.284654
1 phon_R01_S01_2 122.400 148.650 113.819 0.00968 0.00008 0.00465 0.00696 0.01394 0.06134 0.626 0.03134 0.04518 0.04368 0.09403 0.01929 19.085 1 0.458359 0.819521 -4.075192 0.335590 2.486855 0.368674
2 phon_R01_S01_3 116.682 131.111 111.555 0.01050 0.00009 0.00544 0.00781 0.01633 0.05233 0.482 0.02757 0.03858 0.03590 0.08270 0.01309 20.651 1 0.429895 0.825288 -4.443179 0.311173 2.342259 0.332634
3 phon_R01_S01_4 116.676 137.871 111.366 0.00997 0.00009 0.00502 0.00698 0.01505 0.05492 0.517 0.02924 0.04005 0.03772 0.08771 0.01353 20.644 1 0.434969 0.819235 -4.117501 0.334147 2.405554 0.368975
4 phon_R01_S01_5 116.014 141.781 110.655 0.01284 0.00011 0.00655 0.00908 0.01966 0.06425 0.584 0.03490 0.04825 0.04465 0.10470 0.01767 19.649 1 0.417356 0.823484 -3.747787 0.234513 2.332180 0.410335
In [6]:
df1 = pd.read_csv(r"C:\Users\Iddrisu Bachokun\Desktop\Python\Data\parkinsin\parkinsons.csv",usecols=["MDVP:Fo(Hz)","MDVP:Fhi(Hz)","MDVP:Flo(Hz)","MDVP:Shimmer(dB)",
                                                                                                    "Shimmer:APQ3","Shimmer:APQ5","NHR","HNR","status"])
df1.head()
Out[6]:
MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Shimmer(dB) Shimmer:APQ3 Shimmer:APQ5 NHR HNR status
0 119.992 157.302 74.997 0.426 0.02182 0.03130 0.02211 21.033 1
1 122.400 148.650 113.819 0.626 0.03134 0.04518 0.01929 19.085 1
2 116.682 131.111 111.555 0.482 0.02757 0.03858 0.01309 20.651 1
3 116.676 137.871 111.366 0.517 0.02924 0.04005 0.01353 20.644 1
4 116.014 141.781 110.655 0.584 0.03490 0.04825 0.01767 19.649 1
In [5]:
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   MDVP:Fo(Hz)       195 non-null    float64
 1   MDVP:Fhi(Hz)      195 non-null    float64
 2   MDVP:Flo(Hz)      195 non-null    float64
 3   MDVP:Shimmer(dB)  195 non-null    float64
 4   Shimmer:APQ3      195 non-null    float64
 5   Shimmer:APQ5      195 non-null    float64
 6   NHR               195 non-null    float64
 7   HNR               195 non-null    float64
 8   status            195 non-null    int64  
 9   D2                195 non-null    float64
 10  PPE               195 non-null    float64
dtypes: float64(10), int64(1)
memory usage: 16.9 KB
In [8]:
df1.describe()
Out[8]:
MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Shimmer(dB) Shimmer:APQ3 Shimmer:APQ5 NHR HNR status
count 195.000000 195.000000 195.000000 195.000000 195.000000 195.000000 195.000000 195.000000 195.000000
mean 154.228641 197.104918 116.324631 0.282251 0.015664 0.017878 0.024847 21.885974 0.753846
std 41.390065 91.491548 43.521413 0.194877 0.010153 0.012024 0.040418 4.425764 0.431878
min 88.333000 102.145000 65.476000 0.085000 0.004550 0.005700 0.000650 8.441000 0.000000
25% 117.572000 134.862500 84.291000 0.148500 0.008245 0.009580 0.005925 19.198000 1.000000
50% 148.790000 175.829000 104.315000 0.221000 0.012790 0.013470 0.011660 22.085000 1.000000
75% 182.769000 224.205500 140.018500 0.350000 0.020265 0.022380 0.025640 25.075500 1.000000
max 260.105000 592.030000 239.170000 1.302000 0.056470 0.079400 0.314820 33.047000 1.000000
In [9]:
df1.isna().sum()
Out[9]:
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
NHR                 0
HNR                 0
status              0
dtype: int64
In [12]:
df1.status.unique()
Out[12]:
array([1, 0], dtype=int64)
In [15]:
df1['status'].value_counts()
Out[15]:
1    147
0     48
Name: status, dtype: int64

Train, validation, test datasets

In [30]:
from imblearn.over_sampling import RandomOverSampler
In [18]:
train , valid, test = np.split(df1.sample(frac=1), [int(0.6*len(df1)),int(0.8*len(df1))])
In [19]:
def scale_dataset(dataframe):
    x = dataframe[dataframe.cols[:-1]].values
    y = dataframe[dataframe.cols[-1]].values
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    data = np.hstack((x,np.reshape(y,(-1,1))))
    return data, x,y 
In [20]:
print(len(train[train['status']==1])) # gamma
print(len(train[train['status']==0])) # hadron
91
26

Oversampling

We see that the number of hadron is far too small compare with the gamma. This will poss a problem when we train our data in this form. We threrefore need to oversample the data to bring the smaller data to the same length as the longer data. This is very usefull whwen you don't haveenough data, and so the over sample will bump the data up. There is also a situation that will reqiyer undersample. In this situation , the longer sample is undersampled to be of thesame length as the samaller data. This is usefull when you have so much data that undersampling will not compromise your results.

In [37]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
In [38]:
def scale_dataset(dataframe,oversample=False):
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    
    if oversample:
        ros = RandomOverSampler()
        x , y = ros.fit_resample(x,y)
    data = np.hstack((x,np.reshape(y,(-1,1))))
    return data, x,y 
In [39]:
def scale_dataset(dataframe,oversample=False):
    x = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    
    if oversample:
        ros = RandomOverSampler()
        x , y = ros.fit_resample(x,y)
    data = np.hstack((x,np.reshape(y,(-1,1))))
    return data, x,y 
In [ ]:
 
In [40]:
print(len(train[train['status']==1])) # gamma
print(len(train[train['status']==0])) # hadron
91
26
In [41]:
train, x_train, y_train = scale_dataset(train, oversample=True)
valid, x_valid ,y_valid = scale_dataset(valid, oversample=False)
test, x_test, y_test = scale_dataset(test, oversample=False)
In [42]:
sum(y_train==1)
Out[42]:
91
In [43]:
sum(y_train==0)
Out[43]:
91
In [48]:
for label in df1.columns[:-1] :
    plt.hist(df1[df1["status"]==1][label], color ='blue', label = 'Disease', alpha=0.7, density= True)
    plt.hist(df1[df1["status"]==0][label], color ='red',label = 'No Disease', alpha=0.7, density= True)
    plt.title(label)
    plt.ylabel("Probability")
    plt.xlabel(label)
    plt.legend()
    plt.show()

KNN

In [49]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
In [50]:
knn_model = KNeighborsClassifier()
knn_model.fit(x_train,y_train)
Out[50]:
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
In [51]:
y_pred = knn_model.predict(x_test)
In [52]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.53      0.73      0.62        11
           1       0.88      0.75      0.81        28

    accuracy                           0.74        39
   macro avg       0.70      0.74      0.71        39
weighted avg       0.78      0.74      0.75        39

In [53]:
input_data = (119.992,157.302,74.997,0.426,0.02182,0.03130,0.02211,21.033)
input_data_np = np.asarray(input_data)
input_data_re = input_data_np.reshape(1,-1)
pred = knn_model.predict(input_data_re)
print(pred)
if(pred[0]==0):
    print("The person has no disease")
    
else:
    print("The person has the disease")
[0]
The person has no disease

Naive Bayes

In [54]:
from sklearn.naive_bayes import GaussianNB
In [55]:
nb_model = GaussianNB()
nb_model.fit(x_test,y_test)
Out[55]:
GaussianNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GaussianNB()
In [57]:
y_pred = nb_model.predict(x_test)
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.41      0.82      0.55        11
           1       0.88      0.54      0.67        28

    accuracy                           0.62        39
   macro avg       0.65      0.68      0.61        39
weighted avg       0.75      0.62      0.63        39

Logistic Regression

In [58]:
from sklearn.linear_model import LogisticRegression
In [59]:
logistic_model = LogisticRegression()
logistic_model.fit(x_train,y_train)
Out[59]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [ ]:
y_pred = logistic_model.predict(x_test)
In [60]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.41      0.82      0.55        11
           1       0.88      0.54      0.67        28

    accuracy                           0.62        39
   macro avg       0.65      0.68      0.61        39
weighted avg       0.75      0.62      0.63        39

In [ ]:
 

SVM

In [61]:
from sklearn.svm import SVC
sv_model = SVC()
In [62]:
sv_model.fit(x_train,y_train)
Out[62]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [63]:
y_pred = sv_model.predict(x_test)
In [65]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.44      0.73      0.55        11
           1       0.86      0.64      0.73        28

    accuracy                           0.67        39
   macro avg       0.65      0.69      0.64        39
weighted avg       0.74      0.67      0.68        39

Tree

In [66]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
In [67]:
dt_model.fit(x_train,y_train)
Out[67]:
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [68]:
y_pred = dt_model.predict(x_test)
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.86      0.55      0.67        11
           1       0.84      0.96      0.90        28

    accuracy                           0.85        39
   macro avg       0.85      0.75      0.78        39
weighted avg       0.85      0.85      0.83        39

Neural Networks

In [70]:
import tensorflow as tf
In [75]:
def plot_history(history):
    fig ,(ax1, ax2)=plt.subplots(1,2,figsize = (10,4))
    ax1.plot(history.history['loss'],label='loss')
    ax1.plot(history.history['val_loss'],label='val_loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Binary_crossentropy')
    #ax1.legend()
    ax1.grid(True)
  
#def plot_accuracy(history):
    ax2.plot(history.history['accuracy'],label='accuracy')
    ax2.plot(history.history['val_accuracy'],label='val_accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    #ax2.legend()
    ax2.grid(True)
    plt.show()
In [76]:
import tensorflow as tf
def train_model(X_train, y_train, num_nodes, dropout_prob,lr, batch_size, epochs):
    nnw_model =tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes,activation='relu',input_shape=(8,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes,activation='relu'),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(1,activation='sigmoid')])

    nnw_model.compile(optimizer= tf.keras.optimizers.Adam(lr),loss='binary_crossentropy',
                      metrics=['accuracy'])
# Training the model
    history = nnw_model.fit(
        x_train, y_train, epochs=epochs, batch_size= batch_size, validation_split=0.2, verbose=0
    )
    return nnw_model, history
In [77]:
east_val_loss = list('inf')
least_loss_model = None
epochs = 100
for num_nodes in [16,32,64]:
    for dropout_prob in[0,0.2]:
        for lr in [0.01, 0.005, 0.001]:
            for batch_size in [32,64,128]:
                print(f"{num_nodes} nodes, dropout {dropout_prob},lr {lr},batch size {batch_size}")
                model , history = train_model(x_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs)
                plot_history(history)
                val_loss= model.evaluate(x_valid, y_valid)
                #if val_loss < least_val_loss:
                least_val_loss = val_loss
                least_loss_model = model
16 nodes, dropout 0,lr 0.01,batch size 32
2/2 [==============================] - 0s 3ms/step - loss: 1.4168 - accuracy: 0.8205
16 nodes, dropout 0,lr 0.01,batch size 64
2/2 [==============================] - 0s 3ms/step - loss: 1.7528 - accuracy: 0.8205
16 nodes, dropout 0,lr 0.01,batch size 128
2/2 [==============================] - 0s 4ms/step - loss: 0.8667 - accuracy: 0.7949
16 nodes, dropout 0,lr 0.005,batch size 32
2/2 [==============================] - 0s 4ms/step - loss: 0.8602 - accuracy: 0.8205
16 nodes, dropout 0,lr 0.005,batch size 64
2/2 [==============================] - 0s 3ms/step - loss: 0.6783 - accuracy: 0.8462
16 nodes, dropout 0,lr 0.005,batch size 128
2/2 [==============================] - 0s 3ms/step - loss: 0.4338 - accuracy: 0.8462
16 nodes, dropout 0,lr 0.001,batch size 32
2/2 [==============================] - 0s 0s/step - loss: 0.4735 - accuracy: 0.7949
16 nodes, dropout 0,lr 0.001,batch size 64
2/2 [==============================] - 0s 3ms/step - loss: 0.3914 - accuracy: 0.7949
16 nodes, dropout 0,lr 0.001,batch size 128
2/2 [==============================] - 0s 0s/step - loss: 0.3944 - accuracy: 0.7949
16 nodes, dropout 0.2,lr 0.01,batch size 32
2/2 [==============================] - 0s 4ms/step - loss: 1.1207 - accuracy: 0.8462
16 nodes, dropout 0.2,lr 0.01,batch size 64
2/2 [==============================] - 0s 4ms/step - loss: 0.5186 - accuracy: 0.8462
16 nodes, dropout 0.2,lr 0.01,batch size 128
2/2 [==============================] - 0s 3ms/step - loss: 0.9787 - accuracy: 0.8718
16 nodes, dropout 0.2,lr 0.005,batch size 32
2/2 [==============================] - 0s 4ms/step - loss: 0.5754 - accuracy: 0.8462
16 nodes, dropout 0.2,lr 0.005,batch size 64
2/2 [==============================] - 0s 3ms/step - loss: 0.5434 - accuracy: 0.8462
16 nodes, dropout 0.2,lr 0.005,batch size 128
2/2 [==============================] - 0s 4ms/step - loss: 0.3699 - accuracy: 0.8205
16 nodes, dropout 0.2,lr 0.001,batch size 32
2/2 [==============================] - 0s 4ms/step - loss: 0.4184 - accuracy: 0.7949
16 nodes, dropout 0.2,lr 0.001,batch size 64
2/2 [==============================] - 0s 4ms/step - loss: 0.4187 - accuracy: 0.7692
16 nodes, dropout 0.2,lr 0.001,batch size 128
2/2 [==============================] - 0s 3ms/step - loss: 0.3923 - accuracy: 0.8205
32 nodes, dropout 0,lr 0.01,batch size 32
2/2 [==============================] - 0s 3ms/step - loss: 2.6784 - accuracy: 0.7949
32 nodes, dropout 0,lr 0.01,batch size 64
2/2 [==============================] - 0s 5ms/step - loss: 2.2335 - accuracy: 0.7949
32 nodes, dropout 0,lr 0.01,batch size 128
2/2 [==============================] - 0s 3ms/step - loss: 0.8360 - accuracy: 0.8462
32 nodes, dropout 0,lr 0.005,batch size 32
2/2 [==============================] - 0s 3ms/step - loss: 1.2914 - accuracy: 0.8205
32 nodes, dropout 0,lr 0.005,batch size 64
2/2 [==============================] - 0s 3ms/step - loss: 1.2933 - accuracy: 0.7949
32 nodes, dropout 0,lr 0.005,batch size 128
2/2 [==============================] - 0s 2ms/step - loss: 0.6945 - accuracy: 0.8205
32 nodes, dropout 0,lr 0.001,batch size 32
2/2 [==============================] - 0s 3ms/step - loss: 0.4411 - accuracy: 0.8205
32 nodes, dropout 0,lr 0.001,batch size 64
2/2 [==============================] - 0s 0s/step - loss: 0.4680 - accuracy: 0.8462
32 nodes, dropout 0,lr 0.001,batch size 128
2/2 [==============================] - 0s 16ms/step - loss: 0.4147 - accuracy: 0.7949
32 nodes, dropout 0.2,lr 0.01,batch size 32
2/2 [==============================] - 0s 3ms/step - loss: 1.7429 - accuracy: 0.8205
32 nodes, dropout 0.2,lr 0.01,batch size 64
2/2 [==============================] - 0s 13ms/step - loss: 0.6865 - accuracy: 0.8462
32 nodes, dropout 0.2,lr 0.01,batch size 128
2/2 [==============================] - 0s 0s/step - loss: 0.5627 - accuracy: 0.8462
32 nodes, dropout 0.2,lr 0.005,batch size 32
2/2 [==============================] - 0s 16ms/step - loss: 0.9494 - accuracy: 0.8462
32 nodes, dropout 0.2,lr 0.005,batch size 64
2/2 [==============================] - 0s 0s/step - loss: 0.9690 - accuracy: 0.8462
32 nodes, dropout 0.2,lr 0.005,batch size 128
2/2 [==============================] - 0s 0s/step - loss: 0.6138 - accuracy: 0.8205
32 nodes, dropout 0.2,lr 0.001,batch size 32
2/2 [==============================] - 0s 0s/step - loss: 0.4329 - accuracy: 0.7949
32 nodes, dropout 0.2,lr 0.001,batch size 64
2/2 [==============================] - 0s 0s/step - loss: 0.4305 - accuracy: 0.8462
32 nodes, dropout 0.2,lr 0.001,batch size 128
2/2 [==============================] - 0s 16ms/step - loss: 0.4066 - accuracy: 0.8205
64 nodes, dropout 0,lr 0.01,batch size 32
2/2 [==============================] - 0s 16ms/step - loss: 2.3440 - accuracy: 0.8205
64 nodes, dropout 0,lr 0.01,batch size 64
2/2 [==============================] - 0s 0s/step - loss: 2.7491 - accuracy: 0.7692
64 nodes, dropout 0,lr 0.01,batch size 128
2/2 [==============================] - 0s 0s/step - loss: 1.4621 - accuracy: 0.8205
64 nodes, dropout 0,lr 0.005,batch size 32
2/2 [==============================] - 0s 0s/step - loss: 2.4739 - accuracy: 0.8462
64 nodes, dropout 0,lr 0.005,batch size 64
2/2 [==============================] - 0s 0s/step - loss: 1.6316 - accuracy: 0.7949
64 nodes, dropout 0,lr 0.005,batch size 128
2/2 [==============================] - 0s 4ms/step - loss: 1.3957 - accuracy: 0.7949
64 nodes, dropout 0,lr 0.001,batch size 32
2/2 [==============================] - 0s 0s/step - loss: 0.7824 - accuracy: 0.8205
64 nodes, dropout 0,lr 0.001,batch size 64
2/2 [==============================] - 0s 16ms/step - loss: 0.4458 - accuracy: 0.8205
64 nodes, dropout 0,lr 0.001,batch size 128
2/2 [==============================] - 0s 0s/step - loss: 0.4083 - accuracy: 0.8205
64 nodes, dropout 0.2,lr 0.01,batch size 32
2/2 [==============================] - 0s 0s/step - loss: 2.1601 - accuracy: 0.8462
64 nodes, dropout 0.2,lr 0.01,batch size 64
2/2 [==============================] - 0s 0s/step - loss: 1.6248 - accuracy: 0.8205
64 nodes, dropout 0.2,lr 0.01,batch size 128
2/2 [==============================] - 0s 0s/step - loss: 1.1621 - accuracy: 0.8462
64 nodes, dropout 0.2,lr 0.005,batch size 32
2/2 [==============================] - 0s 0s/step - loss: 1.3952 - accuracy: 0.8205
64 nodes, dropout 0.2,lr 0.005,batch size 64
2/2 [==============================] - 0s 0s/step - loss: 1.0800 - accuracy: 0.8205
64 nodes, dropout 0.2,lr 0.005,batch size 128
2/2 [==============================] - 0s 0s/step - loss: 0.7528 - accuracy: 0.7949
64 nodes, dropout 0.2,lr 0.001,batch size 32
2/2 [==============================] - 0s 0s/step - loss: 0.4400 - accuracy: 0.8462
64 nodes, dropout 0.2,lr 0.001,batch size 64
2/2 [==============================] - 0s 0s/step - loss: 0.4205 - accuracy: 0.8718
64 nodes, dropout 0.2,lr 0.001,batch size 128
2/2 [==============================] - 0s 16ms/step - loss: 0.4396 - accuracy: 0.8462
In [ ]: